This workbook guides you through the data processing stage for the Romania Kendo Stats visualization. This is a multi-stage process, you will need access to the raw data, Python, Excel installed. Any Python packages will also be installed on the way, but we recommend using the Anaconda distribution of Python. If you would like to edit the visualization part, then you will need PowerBI.
The general structure of the repository is the following:
In [3]:
import pandas as pd, numpy as np, json
import kendo_romania
Read data
In [376]:
members=pd.read_excel('rawdata/Evidenta membrilor.xlsm',header=[1])
In [377]:
members.head()
Out[377]:
In [378]:
members.columns
Out[378]:
In [379]:
members=members[[231, 'Nr. EKF',
'Club', 'Unnamed: 3',
'Numele', 'Prenumele',
'Gen', 'Data naşterii',
'1 kyu','practică',
'1 dan', '2 dan',
'3 dan', '4 dan',
'5 dan', '6 dan',
'7 dan', '8 dan']]
In [380]:
members.head()
Out[380]:
In [381]:
def get_transfer(name,tf,verbose=False):
if tf==[]:
return tf
else:
to_blank=[' ','(',')','Transfer:','?','FDS','/']
to_replace={'Hungary':'HUN'}
to_year={'Gușu Rebeca':'2010'}
def get_tf_clubs(z):
for t in range(len(to_blank)):
z=z.replace(to_blank[t],'')
for t in to_replace:
z=z.replace(t,to_replace[t])
if ('=>') in z:
from_to=z.find('=>')
to_return={'from':z[from_to-3:from_to],'to':z[from_to+2:from_to+5],'time':z[-4:]}
if verbose:
to_return['orig']=z
else:
print('error with transfer',z)
to_return=z
##check years
#infer year from wrong note order
if '20' not in to_return['time']:
if '20' in z:
to_return['time']=z[z.find('20'):z.find('20')+4]
#if still not inferred, then manual fix
if '20' not in to_return['time']:
to_return['time']=to_year[name]
to_return['time']=int(to_return['time'])
return to_return
transfers=str(tf).split('\n')
tfr=[]
for i in transfers:
if not i in ('','nan'):
tfr.append(get_tf_clubs(i))
return sorted(tfr, key=lambda k: k['time'])
In [382]:
def get_club_by_year(d,club,year):
if d==[]:
return [club]
else:
years={}
transfer_years=[mingrade]+[i['time'] for i in d]+[maxyear]
transfer_clubs=[d[0]['from']]+[i['to'] for i in d]
for i in range(1,len(transfer_years)):
for y in range(transfer_years[i-1],transfer_years[i]+1):
if y not in years:years[y]=[]
years[y].append(transfer_clubs[i-1])
return years[year]
In [350]:
def add_to_club(data,club,year,d):
if club not in data: data[club]={}
if year not in data[club]:data[club][year]=[]
data[club][year].append(d)
return data
In [370]:
pretty_clubs={'ARA':'Arashi', 'BDS':'Budoshin', 'BSD':'Bushido', 'BTK':'Bushi Tokukai', 'BG':'Bulgaria',
'CDO':'Coroan de Oțel', 'CRK':'Clubul Român de Kendo', 'HAR':'Hargita',
'ICH':'Ichimon', 'IKA':'Ikada','ISH':'Ishhin', 'IT':'Italy','HU':'Hungary','HUN':'Hungary',
'KAS':'Kashi', 'KNS':'Kenshin', 'KYO':'Kyobukan', 'MC':'Macedonia',
'SR':'Serbia', 'MN':'Montenegro', 'MOL':'Moldova', 'MUS':'Museido',
'RON':'Ronin-do', 'SAK':'Sakura', 'SAM':'Sam-sho','SAN':'Sankon', 'SBK':'Sobukan',
'SON':'Sonkei', 'SR':'Serbia', 'TAI':'Taiken', 'TR':'Turkey','ACKIJ':'ACKIJ',
'YUK':'Yu-kai','KAY':'Kaybukan'}
club_replacer={'':'KYO'}
In [371]:
data={}
for i in members.T.iteritems():
grades=i[1][['1 kyu','1 dan','2 dan','3 dan','4 dan','5 dan','6 dan','7 dan','8 dan']].dropna()
grades0=i[1][['1 dan','2 dan','3 dan','4 dan','5 dan','6 dan','7 dan','8 dan']].dropna()
df=pd.DataFrame(grades0)
df.columns=['dan']
df=df.reset_index().set_index('dan').sort_index()
#dummy={j:str(grades[j])[:10] for j in grades.index}
dummy={}
grades=pd.to_datetime(grades.astype(str))
if len(grades)>0:
mingrade=grades.min().year
maxgrade=grades.max().year
else:
mingrade=np.nan
maxgrade=np.nan
dummy['name']=i[1]['Numele']+' '+i[1]['Prenumele']
dummy['birth']=str(i[1]['Data naşterii'])[:10]
dummy['gen']=i[1]['Gen']
dummy['ekf']=i[1]['Nr. EKF']
dummy['active']=i[1][231]
club=i[1]['Club']
dummy['transfer']=get_transfer(dummy['name'],i[1]['Unnamed: 3'])
if np.isnan(mingrade):
mingrade=2016 #default starting year
maxyear=2019 #default max year
for year in range(mingrade,maxyear):
#get year from exams
dummy['dan']=len(df[:str
(year)])
#get club from transfers
clubs=get_club_by_year(dummy['transfer'],club,year)
clubs=clubs[:1] #remove this step to double count. this limits to first club in transfer years
for j in range(len(clubs)):
iclub=clubs[j]
if iclub in club_replacer: iclub=club_replacer[iclub]
dummy['club']=iclub
dummy['pretty_club']=pretty_clubs[iclub]
dummy['age']=year-1-pd.to_datetime(dummy['birth']).year
data=add_to_club(data,iclub,year,dummy.copy())
In [372]:
all_data=[]
for club in data:
for year in data[club]:
df=pd.DataFrame(data[club][year])
df['year']=year
df['club']=club
df=df.drop('transfer',axis=1)
all_data.append(df)
all_data=pd.concat(all_data)
In [374]:
all_data.to_csv('data/members_base.csv')
In [ ]: